Research question

1.a) Does terrain characteristics (eg. exposition, slope) correlate with land cover, ie. are there major differences between single land covers?

Libraries

library(terra)
## terra 1.7.78
library(raster)
## Warning: package 'raster' was built under R version 4.4.1
## Cargando paquete requerido: sp
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Cargando paquete requerido: ggplot2
## 
## Adjuntando el paquete: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Cargando paquete requerido: lattice
library(data.table)
## 
## Adjuntando el paquete: 'data.table'
## The following object is masked from 'package:raster':
## 
##     shift
## The following object is masked from 'package:terra':
## 
##     shift
library(sf)
## Warning: package 'sf' was built under R version 4.4.1
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:raster':
## 
##     intersect, select, union
## The following objects are masked from 'package:terra':
## 
##     intersect, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(vcd)
## Warning: package 'vcd' was built under R version 4.4.1
## Cargando paquete requerido: grid
## 
## Adjuntando el paquete: 'grid'
## The following object is masked from 'package:terra':
## 
##     depth
## 
## Adjuntando el paquete: 'vcd'
## The following object is masked from 'package:raster':
## 
##     mosaic
## The following objects are masked from 'package:terra':
## 
##     mosaic, sieve
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.4.1
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

1. Creating a Stack

Sentinel raster with bands 2 (blue), 3 (green), 4 (red) & 8 (NIR).

rast1 <- rast("WVP_10m.jp2")
rast2 <- rast("AOT_10m.jp2")
rast3 <-  rast("B02_10m.jp2")
rast4 <-  rast("B03_10m.jp2")
rast5 <-  rast("B04_10m.jp2")
rast6 <-  rast("B08_10m.jp2")
rast7 <-  rast("TCI_10m.jp2")

rast_fogo <- c(rast3,rast4,rast5,rast6) 
plot(rast_fogo)

Crop & Stack

# extension

crop_extent <- ext(c(765000,795000,1638000,1667000)) #xmin, xmax, ymin, ymax

# crop stack

fogo_crop <- crop(rast_fogo, crop_extent)
plot(fogo_crop$B08_10m)

writeRaster(fogo_crop, "fogo_crop.tif", overwrite=TRUE)

2. NDVI

True color composition

# true
plotRGB(fogo_crop, b =1,
        g =2,
        r = 3,
        stretch="lin")

# false

plotRGB(fogo_crop, b =1,
        g =2,
        r = 4, # NRI
        stretch="lin")

Calculate NDVI ndvi = (NIR - RED)/(NIR + RED)

ndvi = (fogo_crop[[4]] - fogo_crop[[3]]) / (fogo_crop[[4]] + fogo_crop[[3]])

plot(ndvi)

Other method: function for NDVI

NDVI <- function(nir, red){
  if (length(nir) != length(red)){
    stop("NIR and RED dont match")
  }
  
  ndvi <- (nir-red) / (nir+red)
  names(ndvi)<- "ndvi"
  
  ndvi
}
# Apply the function

ndvi_fog <-NDVI(fogo_crop$B08_10m, fogo_crop$B04_10m)

plot(ndvi_fog)

Unir NDVI con el stack

result <- c(fogo_crop, ndvi)

names(result) <- c("B02_may", "B03_may","B04_may", "B08_may","NDVI_may")
writeRaster(result, "fogo_ndvi2.tif", overwrite=TRUE, datatype = "FLT4S" )

3. Classification and Regresion with Random forest

sentinel_nov <- rast("sentinel2_20221127_ndvi.tif")
sentinel_may <- rast("fogo_ndvi2.tif")
# combine
sentinel <- c(sentinel_may, sentinel_nov)
# polygons

ref_data <- vect("reference1.gpkg")

table(ref_data$class)
## 
##                       Agriculture               Basaltic rock (old) 
##                                20                                19 
##             Basaltic rock (young)                            Forest 
##                                17                                15 
##                 Human settlements Land with little or no vegetation 
##                                16                                37 
##                             Ocean 
##                                16

we verify if it’s the same place in may and nov

# may
plotRGB(sentinel, b =1,
        g =2,
        r = 3,
        stretch="lin")

# nov
plotRGB(sentinel, b =6,
        g =7,
        r = 8,
        stretch="lin")

plot(ref_data, add=TRUE, col ="red")

Random forest

df_extract <- extract(sentinel, ref_data, na.rm = TRUE)
ref_data$ID <- seq(1, nrow( ref_data))
df_extract <- merge(df_extract, ref_data)


# create model 

rfmodel = randomForest(x = df_extract[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may",
                                 "B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
                       y = as.factor( df_extract[, c("class")] ), ntree = 100)

Criteria for classification:

# apply model

lcc <- predict(sentinel, rfmodel)
plot(lcc, col=c("orange", "brown", "black", "darkgreen","purple","yellowgreen","darkblue"))

4. Accuracy of the model

# Create a partition (70% training data, 30% test data)
trainIndex <- createDataPartition(df_extract$class, p = 0.7, list = FALSE)

# Split data into training and testing sets
trainData <- df_extract[trainIndex, ] #data que si esta viendo el modelo

testData  <- df_extract[-trainIndex, ] #data que no esta viendo el modelo
rfmodel2 = randomForest(x = trainData[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may",
                                           "B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
                       y = as.factor( trainData[, c("class")] ),
                       ntree = 100)

Apply model with traindata and testdata

lcc2 = predict(sentinel, rfmodel2)
## |---------|---------|---------|---------|=========================================                                          
plot(lcc2, col=c("orange", "brown", "black", "darkgreen","purple","yellowgreen","darkblue"))

writeRaster(lcc2, "fogo_model2.tif", overwrite = TRUE)

Accuracy with test data

# predict the classes of the independent test set
test_prediction = predict(rfmodel2, testData)

# confusion matrix
cfm = table(testData$class, test_prediction)
cfm
##                                    test_prediction
##                                     Agriculture Basaltic rock (old)
##   Agriculture                              1072                   3
##   Basaltic rock (old)                         7                3586
##   Basaltic rock (young)                       1                 106
##   Forest                                      3                   0
##   Human settlements                          10                 302
##   Land with little or no vegetation         133                 203
##   Ocean                                       0                   0
##                                    test_prediction
##                                     Basaltic rock (young) Forest
##   Agriculture                                           6      3
##   Basaltic rock (old)                                  51      0
##   Basaltic rock (young)                              5952      0
##   Forest                                                0   2168
##   Human settlements                                     6      0
##   Land with little or no vegetation                     1      9
##   Ocean                                                 1      0
##                                    test_prediction
##                                     Human settlements
##   Agriculture                                       5
##   Basaltic rock (old)                             283
##   Basaltic rock (young)                             1
##   Forest                                            0
##   Human settlements                              3952
##   Land with little or no vegetation               100
##   Ocean                                             0
##                                    test_prediction
##                                     Land with little or no vegetation Ocean
##   Agriculture                                                     292     0
##   Basaltic rock (old)                                             244     2
##   Basaltic rock (young)                                             2     1
##   Forest                                                           13     0
##   Human settlements                                               223     0
##   Land with little or no vegetation                              5227     0
##   Ocean                                                             0 19701
# accuracy: sum of diagonals divided by all
sum(diag(cfm))/sum(cfm)
## [1] 0.953949

Cross validation

# set up cross validation with 5 folds 
trc = trainControl(method = "cv", number = 5)


# tune model mtry
rfmodel_cv_may = caret::train(x = trainData[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may")],
                          y = trainData[, c("class")],
                          method = "rf",
                          trControl = trc,
                          ntree = 100,
                          tuneLength = 3) 

rfmodel_cv_nov = caret::train(x = trainData[,c("B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
                          y = trainData[, c("class")],
                          method = "rf",
                          trControl = trc,
                          ntree = 100,
                          tuneLength = 3)

rfmodel_cv_full = caret::train(x = trainData[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may",
                                              "B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
                          y = trainData[, c("class")],
                          method = "rf",
                          trControl = trc,
                          ntree = 100,
                          tuneLength = 3)
# predict the classes of the independet test set

test_prediction <- predict(rfmodel_cv_full, testData)

# confusion matrix

confusionMatrix(as.factor(testData$class), test_prediction, mode="everything")
## Confusion Matrix and Statistics
## 
##                                    Reference
## Prediction                          Agriculture Basaltic rock (old)
##   Agriculture                              1058                   3
##   Basaltic rock (old)                         7                3576
##   Basaltic rock (young)                       3                 114
##   Forest                                      4                   0
##   Human settlements                           9                 305
##   Land with little or no vegetation         143                 206
##   Ocean                                       0                   0
##                                    Reference
## Prediction                          Basaltic rock (young) Forest
##   Agriculture                                           7      2
##   Basaltic rock (old)                                  49      0
##   Basaltic rock (young)                              5941      0
##   Forest                                                0   2166
##   Human settlements                                     6      0
##   Land with little or no vegetation                     1     15
##   Ocean                                                 1      0
##                                    Reference
## Prediction                          Human settlements
##   Agriculture                                       5
##   Basaltic rock (old)                             283
##   Basaltic rock (young)                             2
##   Forest                                            0
##   Human settlements                              3949
##   Land with little or no vegetation                91
##   Ocean                                             0
##                                    Reference
## Prediction                          Land with little or no vegetation Ocean
##   Agriculture                                                     306     0
##   Basaltic rock (old)                                             256     2
##   Basaltic rock (young)                                             2     1
##   Forest                                                           14     0
##   Human settlements                                               224     0
##   Land with little or no vegetation                              5217     0
##   Ocean                                                             0 19701
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9528          
##                  95% CI : (0.9508, 0.9548)
##     No Information Rate : 0.4512          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9359          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Agriculture Class: Basaltic rock (old)
## Sensitivity                     0.86438                    0.85062
## Specificity                     0.99239                    0.98487
## Pos Pred Value                  0.76611                    0.85694
## Neg Pred Value                  0.99607                    0.98410
## Precision                       0.76611                    0.85694
## Recall                          0.86438                    0.85062
## F1                              0.81228                    0.85377
## Prevalence                      0.02803                    0.09627
## Detection Rate                  0.02423                    0.08189
## Detection Prevalence            0.03162                    0.09556
## Balanced Accuracy               0.92838                    0.91775
##                      Class: Basaltic rock (young) Class: Forest
## Sensitivity                                0.9893       0.99221
## Specificity                                0.9968       0.99957
## Pos Pred Value                             0.9799       0.99176
## Neg Pred Value                             0.9983       0.99959
## Precision                                  0.9799       0.99176
## Recall                                     0.9893       0.99221
## F1                                         0.9846       0.99199
## Prevalence                                 0.1375       0.04999
## Detection Rate                             0.1360       0.04960
## Detection Prevalence                       0.1388       0.05001
## Balanced Accuracy                          0.9931       0.99589
##                      Class: Human settlements
## Sensitivity                           0.91201
## Specificity                           0.98617
## Pos Pred Value                        0.87892
## Neg Pred Value                        0.99027
## Precision                             0.87892
## Recall                                0.91201
## F1                                    0.89516
## Prevalence                            0.09916
## Detection Rate                        0.09043
## Detection Prevalence                  0.10289
## Balanced Accuracy                     0.94909
##                      Class: Land with little or no vegetation Class: Ocean
## Sensitivity                                            0.8668       0.9998
## Specificity                                            0.9879       1.0000
## Pos Pred Value                                         0.9196       0.9999
## Neg Pred Value                                         0.9789       0.9999
## Precision                                              0.9196       0.9999
## Recall                                                 0.8668       0.9998
## F1                                                     0.8924       0.9999
## Prevalence                                             0.1378       0.4512
## Detection Rate                                         0.1195       0.4511
## Detection Prevalence                                   0.1299       0.4512
## Balanced Accuracy                                      0.9273       0.9999

Definition: The proportion of correctly classified instances among the total instances. Value: 0.9531 (95.31%) Interpretation: The model correctly classified 95.31% of all instances.

Definition: The range within which the true accuracy is expected to fall, with 95% confidence. Value: (0.9511, 0.9551) Interpretation: We are 95% confident that the true accuracy of the model lies between 95.11% and 95.51%.

Definition: The p-value testing whether the model’s accuracy is significantly better than the No Information Rate (NIR), which is the accuracy expected by chance. Value: < 2.2e-16 Interpretation: The model’s accuracy is highly significantly better than random chance, with a p-value much smaller than 0.05.

Definition: A measure of the agreement between predicted and actual classifications, adjusting for chance agreement. Value: 0.9364 Interpretation: There is a very high level of agreement between the predicted and actual classes, much better than random chance.

Definition: The proportion of correctly predicted positive instances among all predicted positives. Interpretation: Precision is calculated per class, for example: For “bare_ground”: 0.9117 (91.17%) means that when the model predicts “bare_ground,” it is correct 91.17% of the time. Precision values vary by class, reflecting the model’s reliability for each class prediction.

Definition: The proportion of correctly predicted positives among all actual positives. Interpretation: Recall is also calculated per class: For “bare_ground”: 0.8734 (87.34%) means that the model correctly identifies 87.34% of all actual “bare_ground” instances. Higher recall means fewer actual positives are missed.

Definition: The harmonic mean of precision and recall, providing a balanced measure that accounts for both false positives and false negatives. Interpretation: A high F1 score indicates a good balance between precision and recall.

5. Relations with terrain characteristics

# data
elevation<- rast("elevation_fogo.tif")

aspect <-rast("aspect_fogo.tif")

slope <- rast("slope_zona26.tif")

Extract values from the rasters

# Elevation 
lcc2_resampled <- resample(lcc2, elevation, method = "near")

values_df <- as.data.frame(c(lcc2_resampled, elevation), na.rm = TRUE)
colnames(values_df) <- c("land_cover", "elevation")

values_df$land_cover <- as.factor(values_df$land_cover)
values_df$land_cover_num <- as.numeric(as.factor(values_df$land_cover))

# Aspect

lcc3_resampled <- resample(lcc2, aspect, method = "near")

values_df2 <- as.data.frame(c(lcc3_resampled, aspect), na.rm = TRUE)
colnames(values_df2) <- c("land_cover", "aspect")

values_df2$land_cover <- as.factor(values_df2$land_cover)
values_df2$land_cover_num <- as.numeric(as.factor(values_df2$land_cover))

# Slope

lcc4_resampled <- resample(lcc2, slope, method = "near")

values_df3 <- as.data.frame(c(lcc4_resampled, slope), na.rm = TRUE)
colnames(values_df3) <- c("land_cover", "slope")

values_df3$land_cover <- as.factor(values_df3$land_cover)
values_df3$land_cover_num <- as.numeric(as.factor(values_df3$land_cover))

5.0 Normality

# Elevation

hist(values_df$elevation, main="Elevation", xlab="elevation")

# Aspect

hist(values_df2$aspect, main="Aspect", xlab="Aspect")

# Elevation

hist(values_df3$slope, main="Slope", xlab="Slope")

Because of the lack of normality in elevation and slope, we use a non-parametric method: Kruskal-Wallis, which is adequate to test a categorical variable with a continuous.It evaluates the median of the slope/elevation within 2 or more groups of land cover. it’s the non-parametric alternative for ANOVA.

For Aspect, because we have 2 categorical variables, we will use contingency tables, Cramer’s and Fisher’s test.

5.1. Elevation

# Kruskal test
kruskal_elevation <- kruskal.test(elevation ~ land_cover, data = values_df)
kruskal_elevation
## 
##  Kruskal-Wallis rank sum test
## 
## data:  elevation by land_cover
## Kruskal-Wallis chi-squared = 154.99, df = 6, p-value < 2.2e-16

P-value < 0.05, elevation can differ with the type of land cover. But to see which of the land use may be the one that has the strongest relation we can visualize it on a box-plot.

boxplot(elevation ~ land_cover, data = values_df, 
        xlab = "Land Cover", 
        ylab = "Elevation", 
        main = "Elevation & Land Cover",
        col = "lightblue", 
        las = 2)

Ocean and basaltic rock (old) may be the land cover types with the highest relation with elevation because of its small variability.

5.2. Aspect

# names to aspect
values_df2 <- values_df2 %>%
  mutate(aspect2 = case_when(
    aspect == 2 ~ "N",
    aspect == 3 ~ "NE",
    aspect == 4 ~ "E",
    aspect == 5 ~ "SE",
    aspect == 6 ~ "S",
    aspect == 7 ~ "SW",
    aspect == 8 ~ "W",
    aspect == 9 ~ "NW",
    aspect == 10 ~ "N",
    TRUE ~ as.character(aspect)
  ))

# contingency table
table_cont<- table(values_df2$land_cover, values_df2$aspect2)
filtered_table <- table_cont[rowSums(table_cont) > 0, colSums(table_cont) > 0]
# Correspondence analysis
ac <- CA(filtered_table, graph = FALSE)

summary(ac)
## 
## Call:
## CA(X = filtered_table, graph = FALSE) 
## 
## The chi square of independence between the two variables is equal to 235.7635 (p-value =  1.305044e-31 ).
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5
## Variance               0.298   0.163   0.063   0.034   0.017
## % of var.             51.783  28.325  10.936   5.970   2.987
## Cumulative % of var.  51.783  80.108  91.044  97.013 100.000
## 
## Rows
##                                     Iner*1000     Dim.1     ctr    cos2  
## Agriculture                       |   105.657 |   0.428   5.708   0.161 |
## Basaltic rock (old)               |   113.926 |   0.387   6.629   0.173 |
## Basaltic rock (young)             |    96.980 |   0.802  26.329   0.808 |
## Forest                            |   117.769 |   1.560  25.910   0.655 |
## Human settlements                 |    35.385 |   0.465   1.418   0.119 |
## Land with little or no vegetation |   105.316 |  -0.410  34.007   0.962 |
##                                     Dim.2     ctr    cos2     Dim.3     ctr
## Agriculture                        -0.836  39.742   0.613 |   0.488  35.077
## Basaltic rock (old)                 0.812  53.312   0.762 |   0.166   5.760
## Basaltic rock (young)              -0.007   0.003   0.000 |  -0.064   0.800
## Forest                             -0.438   3.729   0.052 |  -0.905  41.310
## Human settlements                   0.469   2.630   0.121 |   0.628  12.225
## Land with little or no vegetation  -0.040   0.584   0.009 |  -0.071   4.829
##                                      cos2  
## Agriculture                         0.209 |
## Basaltic rock (old)                 0.032 |
## Basaltic rock (young)               0.005 |
## Forest                              0.221 |
## Human settlements                   0.217 |
## Land with little or no vegetation   0.029 |
## 
## Columns
##                                     Iner*1000     Dim.1     ctr    cos2  
## E                                 |    84.466 |   0.711  12.025   0.424 |
## N                                 |    54.292 |   0.672  10.734   0.589 |
## NE                                |   131.165 |   1.341  38.272   0.869 |
## NW                                |    62.316 |   0.145   0.811   0.039 |
## S                                 |    61.464 |  -0.332   7.386   0.358 |
## SE                                |    41.844 |   0.273   1.893   0.135 |
## SW                                |    96.902 |  -0.597  28.630   0.880 |
## W                                 |    42.585 |  -0.067   0.249   0.017 |
##                                     Dim.2     ctr    cos2     Dim.3     ctr
## E                                   0.611  16.224   0.313 |   0.485  26.431
## N                                  -0.265   3.041   0.091 |  -0.351  13.844
## NE                                 -0.075   0.220   0.003 |  -0.473  22.527
## NW                                 -0.604  25.704   0.672 |   0.341  21.186
## S                                   0.394  19.016   0.504 |  -0.063   1.274
## SE                                  0.641  19.099   0.743 |   0.118   1.660
## SW                                 -0.047   0.320   0.005 |  -0.151   8.703
## W                                  -0.401  16.375   0.626 |   0.129   4.375
##                                      cos2  
## E                                   0.197 |
## N                                   0.160 |
## NE                                  0.108 |
## NW                                  0.214 |
## S                                   0.013 |
## SE                                  0.025 |
## SW                                  0.056 |
## W                                   0.065 |
# plot correspondence analysis
fviz_ca_biplot(ac, repel = TRUE)

p-value <0.05 = significant relation between the 2 variables

DIM 1:

On the right side of Dim.1: Categories such as “Basaltic rock (young)”, “Forest”, and columns like “NE” are found, suggesting an association of these areas with more natural and geologically young environments.

On the left side of Dim.1: Categories like “Land with little or no vegetation” are grouped, which is associated with arid or sparsely vegetated terrains, along with directions like “W” and “SW.”

DIM 2:

Above Dim.2: “Human settlements” and “Basaltic rock (old)” are located.

Below Dim.2: “Agriculture” is found, implying that this dimension also distinguishes predominantly agricultural areas from other categories. “NW” and “S” are near “Agriculture,” which could indicate some relationship or geographical proximity to cultivated areas.

Dim.1 seems to represent a gradient of vegetation, ranging from arid or sparsely vegetated areas to densely forested areas or young basaltic rocks.

Dim.2 captures a gradient of human intervention and geological age.

5.3. Slope

# Kruskal test
kruskal_slope <- kruskal.test(slope ~ land_cover, data = values_df3)
kruskal_slope
## 
##  Kruskal-Wallis rank sum test
## 
## data:  slope by land_cover
## Kruskal-Wallis chi-squared = 51.688, df = 6, p-value = 2.154e-09

P-value < 0.05, slope can differ with the type of land cover. But to see which of the land use may be the one that has the strongest relation we can visualize it on a box-plot.

boxplot(slope ~ land_cover, data = values_df3, 
        xlab = "Land Cover", 
        ylab = "Slope", 
        main = "Slope & Land Cover",
        col = "lightblue", 
        las = 2)

Forest may be the land cover type with the highest relation with slope because of its small variability.